Part 1- Supervised Learning¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
import time
from sklearn.preprocessing import StandardScaler
pd.options.display.max_columns = None
pd.options.display.max_rows = 80
%matplotlib inline
In [2]:
data = pd.read_csv('bank.csv', sep=';')
In [3]:
data
Out[3]:
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | 261 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | 149 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | 226 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | 151 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | 307 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41183 | 73 | retired | married | professional.course | no | yes | no | cellular | nov | fri | 334 | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | yes |
| 41184 | 46 | blue-collar | married | professional.course | no | no | no | cellular | nov | fri | 383 | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
| 41185 | 56 | retired | married | university.degree | no | yes | no | cellular | nov | fri | 189 | 2 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
| 41186 | 44 | technician | married | professional.course | no | no | no | cellular | nov | fri | 442 | 1 | 999 | 0 | nonexistent | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | yes |
| 41187 | 74 | retired | married | professional.course | no | yes | no | cellular | nov | fri | 239 | 3 | 999 | 1 | failure | -1.1 | 94.767 | -50.8 | 1.028 | 4963.6 | no |
41188 rows × 21 columns
In [4]:
data.shape
Out[4]:
(41188, 21)
In [5]:
data.isnull().sum()
Out[5]:
age 0 job 0 marital 0 education 0 default 0 housing 0 loan 0 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp.var.rate 0 cons.price.idx 0 cons.conf.idx 0 euribor3m 0 nr.employed 0 y 0 dtype: int64
In [6]:
data.columns
Out[6]:
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
dtype='object')
Cleaning Row Names¶
In [7]:
# columns =
Q1 -- EDA - Analysis¶
In [8]:
data_corr = data.corr()
fig, ax = plt.subplots(figsize=(14,10))
sns.heatmap(data_corr, annot=True, fmt='0.2f', linewidths=0.5, cmap='Greens')
plt.title('Correlation Matrix')
plt.show
Out[8]:
<function matplotlib.pyplot.show(close=None, block=None)>
Conclusions:¶
- Highest Direct Relation:
euribor3, & emp.var.rate:0.97,euribor3m & nr.employed:0.95,nr.employed & emp.var.rate:0.91. - Highest Inverse Relation:
previous & pdays:-0.59,nr.employed & previous:-0.50,euribor3m & previous:-.45.
In [9]:
plt.figure(figsize=(10,5))
plt.xticks(rotation=45)
sns.barplot(x='job', data=data, y='age', palette='rainbow')
plt.title('Jobs Vs Age', fontsize='15')
plt.show
Out[9]:
<function matplotlib.pyplot.show(close=None, block=None)>
Q2 -- Perform the following pre-processing tasks:¶
a. Missing Value Analysis¶
In [10]:
data.isnull().sum()
Out[10]:
age 0 job 0 marital 0 education 0 default 0 housing 0 loan 0 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp.var.rate 0 cons.price.idx 0 cons.conf.idx 0 euribor3m 0 nr.employed 0 y 0 dtype: int64
In [11]:
data.columns.unique()
Out[11]:
Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
dtype='object')
In [12]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41188 entries, 0 to 41187 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 41188 non-null int64 1 job 41188 non-null object 2 marital 41188 non-null object 3 education 41188 non-null object 4 default 41188 non-null object 5 housing 41188 non-null object 6 loan 41188 non-null object 7 contact 41188 non-null object 8 month 41188 non-null object 9 day_of_week 41188 non-null object 10 duration 41188 non-null int64 11 campaign 41188 non-null int64 12 pdays 41188 non-null int64 13 previous 41188 non-null int64 14 poutcome 41188 non-null object 15 emp.var.rate 41188 non-null float64 16 cons.price.idx 41188 non-null float64 17 cons.conf.idx 41188 non-null float64 18 euribor3m 41188 non-null float64 19 nr.employed 41188 non-null float64 20 y 41188 non-null object dtypes: float64(5), int64(5), object(11) memory usage: 6.6+ MB
There are no null values in the data set¶
b. Label Encoding wherever required¶
In [13]:
object_cols = data.select_dtypes(include='object')
In [14]:
for col in data.select_dtypes(include=['object']):
print(col, '=' , data[col].unique(), '\n')
job = ['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired' 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur' 'student'] marital = ['married' 'single' 'divorced' 'unknown'] education = ['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course' 'unknown' 'university.degree' 'illiterate'] default = ['no' 'unknown' 'yes'] housing = ['no' 'yes' 'unknown'] loan = ['no' 'yes' 'unknown'] contact = ['telephone' 'cellular'] month = ['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'mar' 'apr' 'sep'] day_of_week = ['mon' 'tue' 'wed' 'thu' 'fri'] poutcome = ['nonexistent' 'failure' 'success'] y = ['no' 'yes']
There are 10 Features and Target to which Label Encoding needs to be applied¶
In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
In [16]:
data1 = data.copy()
In [17]:
for col in object_cols:
data1[col] = le.fit_transform(data1[col])
In [18]:
data1.head()
Out[18]:
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 6 | 1 | 261 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 1 | 57 | 7 | 1 | 3 | 1 | 0 | 0 | 1 | 6 | 1 | 149 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 2 | 37 | 7 | 1 | 3 | 0 | 2 | 0 | 1 | 6 | 1 | 226 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 3 | 40 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 6 | 1 | 151 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 4 | 56 | 7 | 1 | 3 | 0 | 0 | 2 | 1 | 6 | 1 | 307 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
Label Encoding Done ✅¶
In [19]:
target = data1['y']
X = data1.drop(columns=['y'])
c. Selecting important features based on Random Forest¶
In [20]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X,target)
Out[20]:
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [21]:
sorted_idx=rfc.feature_importances_.argsort()[::-1]
plt.figure(figsize=(10,7))
y_values = list(X.columns[sorted_idx])
sns.barplot(x=rfc.feature_importances_[sorted_idx], y=y_values, palette="rainbow")
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()
e. Standardize the data using any one of the scalers provided by sklearn¶
In [22]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
In [23]:
X_scaled = pd.DataFrame(sc.fit_transform(X), columns=X.columns)
In [24]:
X_scaled
Out[24]:
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.533034 | -0.201579 | -0.283741 | -1.753925 | -0.513600 | -1.087707 | -0.452491 | 1.31827 | 0.762558 | -0.718834 | 0.010471 | -0.565922 | 0.195414 | -0.349494 | 0.192622 | 0.648092 | 0.722722 | 0.886447 | 0.712460 | 0.331680 |
| 1 | 1.628993 | 0.911227 | -0.283741 | -0.349730 | 1.945327 | -1.087707 | -0.452491 | 1.31827 | 0.762558 | -0.718834 | -0.421501 | -0.565922 | 0.195414 | -0.349494 | 0.192622 | 0.648092 | 0.722722 | 0.886447 | 0.712460 | 0.331680 |
| 2 | -0.290186 | 0.911227 | -0.283741 | -0.349730 | -0.513600 | 0.942127 | -0.452491 | 1.31827 | 0.762558 | -0.718834 | -0.124520 | -0.565922 | 0.195414 | -0.349494 | 0.192622 | 0.648092 | 0.722722 | 0.886447 | 0.712460 | 0.331680 |
| 3 | -0.002309 | -1.036184 | -0.283741 | -1.285860 | -0.513600 | -1.087707 | -0.452491 | 1.31827 | 0.762558 | -0.718834 | -0.413787 | -0.565922 | 0.195414 | -0.349494 | 0.192622 | 0.648092 | 0.722722 | 0.886447 | 0.712460 | 0.331680 |
| 4 | 1.533034 | 0.911227 | -0.283741 | -0.349730 | -0.513600 | -1.087707 | 2.311440 | 1.31827 | 0.762558 | -0.718834 | 0.187888 | -0.565922 | 0.195414 | -0.349494 | 0.192622 | 0.648092 | 0.722722 | 0.886447 | 0.712460 | 0.331680 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41183 | 3.164336 | 0.354824 | -0.283741 | 0.586399 | -0.513600 | 0.942127 | -0.452491 | -0.75857 | 1.193593 | -1.434368 | 0.292025 | -0.565922 | 0.195414 | -0.349494 | 0.192622 | -0.752343 | 2.058168 | -2.224953 | -1.495186 | -2.815697 |
| 41184 | 0.573445 | -0.757982 | -0.283741 | 0.586399 | -0.513600 | -1.087707 | -0.452491 | -0.75857 | 1.193593 | -1.434368 | 0.481012 | -0.565922 | 0.195414 | -0.349494 | 0.192622 | -0.752343 | 2.058168 | -2.224953 | -1.495186 | -2.815697 |
| 41185 | 1.533034 | 0.354824 | -0.283741 | 1.054464 | -0.513600 | 0.942127 | -0.452491 | -0.75857 | 1.193593 | -1.434368 | -0.267225 | -0.204909 | 0.195414 | -0.349494 | 0.192622 | -0.752343 | 2.058168 | -2.224953 | -1.495186 | -2.815697 |
| 41186 | 0.381527 | 1.467630 | -0.283741 | 0.586399 | -0.513600 | -1.087707 | -0.452491 | -0.75857 | 1.193593 | -1.434368 | 0.708569 | -0.565922 | 0.195414 | -0.349494 | 0.192622 | -0.752343 | 2.058168 | -2.224953 | -1.495186 | -2.815697 |
| 41187 | 3.260295 | 0.354824 | -0.283741 | 0.586399 | -0.513600 | 0.942127 | -0.452491 | -0.75857 | 1.193593 | -1.434368 | -0.074380 | 0.156105 | 0.195414 | 1.671136 | -2.563098 | -0.752343 | 2.058168 | -2.224953 | -1.495186 | -2.815697 |
41188 rows × 20 columns
In [25]:
target
Out[25]:
0 0
1 0
2 0
3 0
4 0
..
41183 1
41184 0
41185 0
41186 1
41187 0
Name: y, Length: 41188, dtype: int32
In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, test_size=0.2, random_state=42)
In [27]:
rfc2 = RandomForestClassifier()
rfc2.fit(X_train, y_train)
Out[27]:
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [28]:
sorted_idx=rfc2.feature_importances_.argsort()[::-1]
plt.figure(figsize=(10,7))
y_values = list(X_scaled.columns[sorted_idx])
sns.barplot(x=rfc2.feature_importances_[sorted_idx], y=y_values, palette="rainbow")
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()
rfc2.feature_importances_
Out[28]:
array([0.09116632, 0.0485781 , 0.02355482, 0.04346493, 0.00895397,
0.02018895, 0.01539676, 0.01034687, 0.01891394, 0.04156039,
0.31438321, 0.04238978, 0.03515509, 0.01317983, 0.02577809,
0.02439972, 0.02175246, 0.02815069, 0.10868429, 0.06400179])
Q.3 & Q.4 Build the following Supervised Learning models:¶
a. Logistic Regression¶
In [29]:
from sklearn.linear_model import LogisticRegression
lc = LogisticRegression(random_state=0, max_iter=10000)
lc.fit(X_train,y_train)
Out[29]:
LogisticRegression(max_iter=10000, random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=10000, random_state=0)
In [30]:
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score
y_pred = lc.predict(X_test)
print(confusion_matrix(y_test,y_pred))
[[7108 195] [ 542 393]]
In [31]:
acc = accuracy_score(y_test,y_pred) * 100
print("Accuracy score is {}%".format(round(acc,2)))
Accuracy score is 91.05%
Applying Cross Validation Score¶
In [32]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(LogisticRegression(),X_scaled,target,cv=10,scoring='accuracy'))
Out[32]:
0.8317141827617217
b. Decision Trees¶
In [33]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, test_size=0.2, random_state=42)
In [35]:
dtc.fit(X_train, y_train)
Out[35]:
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
In [36]:
y_pred = dtc.predict(X_test)
In [37]:
from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test, y_pred)
Out[37]:
0.8885651857246905
In [38]:
confusion_matrix(y_test,y_pred)
Out[38]:
array([[6843, 460],
[ 458, 477]], dtype=int64)
c. Random Forest¶
In [39]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_features=7, max_depth=8, n_jobs=-1)
In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, test_size=0.2, random_state=0)
In [41]:
rfc.fit(X_train, y_train)
Out[41]:
RandomForestClassifier(max_depth=8, max_features=7, n_jobs=-1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_depth=8, max_features=7, n_jobs=-1)
In [42]:
from sklearn.metrics import accuracy_score
y_pred = rfc.predict(X_test)
accuracy_score(y_test, y_pred)
Out[42]:
0.9217042971595047
Part 2 – Unsupervised Learning¶
In [43]:
credit = pd.read_csv('credit_card.csv')
In [44]:
credit.head()
Out[44]:
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C10001 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | C10002 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | C10003 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | C10004 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | NaN | 0.000000 | 12 |
| 4 | C10005 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
In [45]:
credit.shape
Out[45]:
(8950, 18)
Q1. Perform EDA on the given data. What does the primary analysis of several Numeric features reveal?¶
In [46]:
credit_corr = credit.corr()
fig, ax = plt.subplots(figsize=(24,20))
sns.heatmap(credit_corr, annot=True, fmt='0.2f', linewidths=0.5, cmap='Blues')
plt.title('Correlation Matrix')
plt.show
Out[46]:
<function matplotlib.pyplot.show(close=None, block=None)>
Conclusions:¶
- Highest Direct Relation:
ONEOFF_PURCHASES, & :PURCHASES0.92,PURCHASES_TRX & PURCHASES_TRX :0.69
In [47]:
credit.columns
Out[47]:
Index(['CUST_ID', 'BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE'],
dtype='object')
In [48]:
credit.select_dtypes(include='object')
credit = credit.drop(columns='CUST_ID')
In [49]:
sns.pairplot(data=credit)
plt.show()
Q2. Perform the following Exploratory Data Analysis tasks:¶
a. Missing Value Analysis¶
In [50]:
credit.isna().sum()
Out[50]:
BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 1 PAYMENTS 0 MINIMUM_PAYMENTS 313 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
In [51]:
credit['MINIMUM_PAYMENTS'].value_counts(dropna=False)
Out[51]:
NaN 313
299.351881 2
150.317143 1
271.528169 1
6404.855484 1
...
181.773223 1
711.894455 1
256.522546 1
127.799107 1
88.288956 1
Name: MINIMUM_PAYMENTS, Length: 8637, dtype: int64
In [52]:
credit.describe()
Out[52]:
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8949.000000 | 8950.000000 | 8637.000000 | 8950.000000 | 8950.000000 |
| mean | 1564.474828 | 0.877271 | 1003.204834 | 592.437371 | 411.067645 | 978.871112 | 0.490351 | 0.202458 | 0.364437 | 0.135144 | 3.248827 | 14.709832 | 4494.449450 | 1733.143852 | 864.206542 | 0.153715 | 11.517318 |
| std | 2081.531879 | 0.236904 | 2136.634782 | 1659.887917 | 904.338115 | 2097.163877 | 0.401371 | 0.298336 | 0.397448 | 0.200121 | 6.824647 | 24.857649 | 3638.815725 | 2895.063757 | 2372.446607 | 0.292499 | 1.338331 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 50.000000 | 0.000000 | 0.019163 | 0.000000 | 6.000000 |
| 25% | 128.281915 | 0.888889 | 39.635000 | 0.000000 | 0.000000 | 0.000000 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1600.000000 | 383.276166 | 169.123707 | 0.000000 | 12.000000 |
| 50% | 873.385231 | 1.000000 | 361.280000 | 38.000000 | 89.000000 | 0.000000 | 0.500000 | 0.083333 | 0.166667 | 0.000000 | 0.000000 | 7.000000 | 3000.000000 | 856.901546 | 312.343947 | 0.000000 | 12.000000 |
| 75% | 2054.140036 | 1.000000 | 1110.130000 | 577.405000 | 468.637500 | 1113.821139 | 0.916667 | 0.300000 | 0.750000 | 0.222222 | 4.000000 | 17.000000 | 6500.000000 | 1901.134317 | 825.485459 | 0.142857 | 12.000000 |
| max | 19043.138560 | 1.000000 | 49039.570000 | 40761.250000 | 22500.000000 | 47137.211760 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 123.000000 | 358.000000 | 30000.000000 | 50721.483360 | 76406.207520 | 1.000000 | 12.000000 |
There are 2 Parameter which has NaN values.¶
In [53]:
credit['MINIMUM_PAYMENTS'] = credit['MINIMUM_PAYMENTS'].fillna(credit['MINIMUM_PAYMENTS'].mean())
In [54]:
credit['MINIMUM_PAYMENTS'].isna().sum()
Out[54]:
0
Dropping 1 Column of Credit¶
In [55]:
credit = credit.dropna()
In [56]:
credit.isna().sum()
Out[56]:
BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 0 PAYMENTS 0 MINIMUM_PAYMENTS 0 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
b. Outlier Treatment using the Z-score method¶
In [57]:
credit.describe()
Out[57]:
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 | 8949.000000 |
| mean | 1564.647593 | 0.877350 | 1003.316936 | 592.503572 | 411.113579 | 978.959616 | 0.490405 | 0.202480 | 0.364478 | 0.135141 | 3.249078 | 14.711476 | 4494.449450 | 1733.336511 | 864.301501 | 0.153732 | 11.517935 |
| std | 2081.584016 | 0.236798 | 2136.727848 | 1659.968851 | 904.378205 | 2097.264344 | 0.401360 | 0.298345 | 0.397451 | 0.200132 | 6.824987 | 24.858552 | 3638.815725 | 2895.168146 | 2330.700932 | 0.292511 | 1.337134 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 50.000000 | 0.000000 | 0.019163 | 0.000000 | 6.000000 |
| 25% | 128.365782 | 0.888889 | 39.800000 | 0.000000 | 0.000000 | 0.000000 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1600.000000 | 383.282850 | 170.875613 | 0.000000 | 12.000000 |
| 50% | 873.680279 | 1.000000 | 361.490000 | 38.000000 | 89.000000 | 0.000000 | 0.500000 | 0.083333 | 0.166667 | 0.000000 | 0.000000 | 7.000000 | 3000.000000 | 857.062706 | 335.657631 | 0.000000 | 12.000000 |
| 75% | 2054.372848 | 1.000000 | 1110.170000 | 577.830000 | 468.650000 | 1113.868654 | 0.916667 | 0.300000 | 0.750000 | 0.222222 | 4.000000 | 17.000000 | 6500.000000 | 1901.279320 | 864.206542 | 0.142857 | 12.000000 |
| max | 19043.138560 | 1.000000 | 49039.570000 | 40761.250000 | 22500.000000 | 47137.211760 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 123.000000 | 358.000000 | 30000.000000 | 50721.483360 | 76406.207520 | 1.000000 | 12.000000 |
In [58]:
from scipy.stats import zscore
upd_credit = credit[(np.abs(zscore(credit)) < 3).all(axis=1)]
In [59]:
upd_credit.head()
Out[59]:
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 864.206542 | 0.000000 | 12 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
c. Deal with correlated variables.¶
In [60]:
credit_new_corr = credit.corr()
fig, ax = plt.subplots(figsize=(24,20))
sns.heatmap(credit_new_corr, annot=True, fmt='0.2f', linewidths=0.5, cmap='Blues')
plt.title('Correlation Matrix')
plt.show()
Q3. Perform dimensionality reduction using PCA such that the 95% of the variance is explained¶
In [61]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
final_data = pd.DataFrame(sc.fit_transform(upd_credit), columns=upd_credit.columns)
In [62]:
final_data.head()
Out[62]:
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.809652 | -0.334911 | -0.641448 | -0.543505 | -0.448067 | -0.543984 | -0.805098 | -0.668791 | -0.698321 | -0.690806 | -0.587319 | -0.665235 | -1.021765 | -0.762853 | -0.577804 | -0.531523 | 0.312187 |
| 1 | 1.273272 | 0.091127 | -0.734487 | -0.543505 | -0.636612 | 4.671914 | -1.225631 | -0.668791 | -0.910579 | 0.883825 | 0.427706 | -0.797517 | 1.022039 | 1.953121 | 0.623747 | 0.232637 | 0.312187 |
| 2 | 0.807271 | 0.517164 | 0.019551 | 0.430864 | -0.636612 | -0.543984 | 1.297564 | 2.808268 | -0.910579 | -0.690806 | -0.587319 | -0.003826 | 1.192356 | -0.470272 | 0.050485 | -0.531523 | 0.312187 |
| 3 | 0.261448 | -1.186986 | 0.727420 | 1.345575 | -0.636612 | -0.377388 | -1.015366 | -0.379037 | -0.910579 | -0.165931 | -0.333563 | -0.731376 | 1.192356 | -0.903344 | 0.355657 | -0.531523 | 0.312187 |
| 4 | -0.297867 | 0.517164 | -0.718883 | -0.523341 | -0.636612 | -0.543984 | -1.015366 | -0.379037 | -0.910579 | -0.690806 | -0.587319 | -0.731376 | -0.953638 | -0.431099 | -0.442194 | -0.531523 | 0.312187 |
Transformed data before applying PCA¶
In [63]:
from sklearn.decomposition import PCA
pca = PCA()
pca_data = pca.fit_transform(final_data)
In [64]:
explained_variance = pca.explained_variance_ratio_
explained_variance
Out[64]:
array([2.89681469e-01, 2.03617595e-01, 9.53162936e-02, 7.70335753e-02,
6.31629683e-02, 5.38697893e-02, 4.67017544e-02, 3.81408019e-02,
3.45633100e-02, 2.83042968e-02, 1.81173944e-02, 1.72817431e-02,
1.41427165e-02, 1.14430418e-02, 6.35293736e-03, 2.26833379e-03,
1.98008996e-06])
In [65]:
np.cumsum((pca.explained_variance_ratio_))
Out[65]:
array([0.28968147, 0.49329906, 0.58861536, 0.66564893, 0.7288119 ,
0.78268169, 0.82938344, 0.86752425, 0.90208756, 0.93039185,
0.94850925, 0.96579099, 0.97993371, 0.99137675, 0.99772969,
0.99999802, 1. ])
In [66]:
pca_2 = PCA(n_components = 12)
final_df = pca_2.fit_transform(final_data)
In [67]:
final_df
Out[67]:
array([[-1.32562517, -2.04662581, 0.20416784, ..., -0.1254455 ,
0.03903563, -0.28149219],
[-2.78107354, 3.31127858, 0.44440901, ..., 1.49002356,
1.29672022, -1.88447238],
[ 1.23742926, 0.54476019, 1.47222511, ..., 0.99620197,
-0.77998389, -0.45164322],
...,
[-2.57473066, -1.14053369, 0.46583268, ..., -0.1672788 ,
-0.21099164, -0.28364622],
[-0.11663847, -1.95458381, -1.11373905, ..., 0.24033441,
-0.12742849, 0.26632671],
[-2.51578369, -0.8759746 , 0.50195631, ..., -0.39762079,
-0.82680118, 0.38732394]])
In [68]:
final_df = pd.DataFrame(final_df)
In [69]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
start = time.time()
wcss = []
for i in range(1, 11):
kmean = KMeans(n_clusters = i, random_state = 42)
kmean.fit(final_df)
wcss.append(kmean.inertia_)
plt.plot(range(1, 11), wcss, 'rx-')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
print(time.time() - start)
11.518290519714355
In [70]:
start = time.time()
silhouette_avg = []
for i in range(2, 11):
kmean = KMeans(n_clusters = i, random_state = 42)
kmean.fit(final_df)
silhouette_avg.append(silhouette_score(final_df, kmean.labels_))
plt.plot(range(2, 11), silhouette_avg, 'bx-')
plt.title('Silhouette Analysis')
plt.xlabel('Values of K')
plt.ylabel('Silhouette Score')
plt.show()
print(time.time() - start)
17.55561375617981
There are 3 clusters¶
In [71]:
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(final_df)
In [72]:
final_df['clusters'] = clusters
In [73]:
final_df['clusters'].value_counts()
Out[73]:
1 4177 2 1791 0 1466 Name: clusters, dtype: int64